Source: "Text Mining with R" (online book) - https://www.tidytextmining.com/

install.packages("janeaustenr")
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/janeaustenr_0.1.5.tgz'
Content type 'application/x-gzip' length 1620949 bytes (1.5 MB)
==================================================
downloaded 1.5 MB

The downloaded binary packages are in
    /var/folders/_4/6fl_4mzn4nq4f3jpfzqydbz0cql0q5/T//RtmppVA9aE/downloaded_packages
library(janeaustenr)
library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library(stringr)

Tidytext package example, from the tidytext vignette by Julia Silge and David Robinson:

https://cran.r-project.org/web/packages/tidytext/vignettes/tidytext.html

install.packages("tidytext")
also installing the dependencies ‘SnowballC’, ‘ISOcodes’, ‘hunspell’, ‘tokenizers’, ‘stopwords’

trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/SnowballC_0.5.1.tgz'
Content type 'application/x-gzip' length 3182388 bytes (3.0 MB)
==================================================
downloaded 3.0 MB

trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/ISOcodes_2017.09.27.tgz'
Content type 'application/x-gzip' length 300128 bytes (293 KB)
==================================================
downloaded 293 KB

trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/hunspell_2.9.tgz'
Content type 'application/x-gzip' length 2107672 bytes (2.0 MB)
==================================================
downloaded 2.0 MB

trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/tokenizers_0.1.4.tgz'
Content type 'application/x-gzip' length 264190 bytes (257 KB)
==================================================
downloaded 257 KB

trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/stopwords_0.9.0.tgz'
Content type 'application/x-gzip' length 132364 bytes (129 KB)
==================================================
downloaded 129 KB

trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/tidytext_0.1.6.tgz'
Content type 'application/x-gzip' length 2787473 bytes (2.7 MB)
==================================================
downloaded 2.7 MB

The downloaded binary packages are in
    /var/folders/_4/6fl_4mzn4nq4f3jpfzqydbz0cql0q5/T//RtmppVA9aE/downloaded_packages
# Split the `text` column of original_books into tokens stored in a new
# `word` column, then show the resulting table.
tidy_books <- unnest_tokens(original_books, word, text)
tidy_books
# Load the `stop_words` dataset and remove every row of tidy_books that
# matches it. The join key is left implicit, so dplyr reports the column it
# picked.
data("stop_words")
cleaned_books <- anti_join(tidy_books, stop_words)
Joining, by = "word"
# Entries of the NRC lexicon whose sentiment is "joy".
nrcjoy <- filter(get_sentiments("nrc"), sentiment == "joy")

# Words of "Emma" that match the joy lexicon, most frequent first.
tidy_books %>%
  filter(book == "Emma") %>%
  semi_join(nrcjoy) %>%
  count(word, sort = TRUE)
Joining, by = "word"
library(tidyr)
library(tidyr)
# Bing sentiment lexicon, reused by the later joins.
bing <- get_sentiments("bing")

# Net sentiment (positive minus negative word count) for every section of
# 80 lines in each book.
janeaustensentiment <- tidy_books %>%
  inner_join(bing) %>%
  mutate(index = linenumber %/% 80) %>%
  count(book, index, sentiment) %>%
  # One column per sentiment value; sections lacking a sentiment get 0.
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)
Joining, by = "word"
# Plot sentiment scores across the plot trajectory of each novel:
# one bar per index section, one facet per book.
library(ggplot2)
ggplot(janeaustensentiment, aes(x = index, y = sentiment, fill = book)) +
  # geom_col() takes bar heights directly from the data; it is the dedicated
  # form of geom_bar(stat = "identity").
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

# Finding positive and negative words: count how often each word occurs
# under each sentiment, most frequent first.
bing_word_counts <- inner_join(tidy_books, bing) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
Joining, by = "word"
# Show the per-word sentiment counts.
bing_word_counts

# Bar chart of the words counted more than 150 times. Negative words get a
# negated count so their bars point downwards, and words are ordered by that
# signed count.
bing_word_counts %>%
  filter(n > 150) %>%
  mutate(
    n = ifelse(sentiment == "negative", -n, n),
    # Uses the signed n computed on the line above.
    word = reorder(word, n)
  ) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  # geom_col() is the dedicated form of geom_bar(stat = "identity").
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("contribution to sentiment")

install.packages("wordcloud")
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/wordcloud_2.5.tgz'
Content type 'application/x-gzip' length 143945 bytes (140 KB)
==================================================
downloaded 140 KB

The downloaded binary packages are in
    /var/folders/_4/6fl_4mzn4nq4f3jpfzqydbz0cql0q5/T//RtmppVA9aE/downloaded_packages
library(wordcloud)
Loading required package: RColorBrewer
# Word cloud of word frequencies in cleaned_books, limited to 75 words.
with(
  count(cleaned_books, word),
  wordcloud(words = word, freq = n, max.words = 75)
)

library(reshape2)

Attaching package: ‘reshape2’

The following object is masked from ‘package:tidyr’:

    smiths
# Comparison cloud of words by sentiment: count each word per sentiment,
# reshape to a word-by-sentiment matrix (0 where a word lacks a sentiment),
# and plot at most 75 words.
tidy_books %>%
  inner_join(bing) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(formula = word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(max.words = 75, colors = c("#F8766D", "#00BFC4"))
Joining, by = "word"

# Show the second entry of the `sentence` column.
PandP_sentences[["sentence"]][2]
[1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."
# Negative entries of the Bing lexicon.
bingnegative <- filter(get_sentiments("bing"), sentiment == "negative")

# Number of words in every (book, chapter) pair.
wordcounts <- summarize(group_by(tidy_books, book, chapter), words = n())

# Share of negative words per chapter, keeping the top-ranked chapter(s).
tidy_books %>%
  semi_join(bingnegative) %>%
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords / words) %>%
  # Rows with chapter 0 are excluded from the ranking.
  filter(chapter != 0) %>%
  # No weight column is given, so top_n() ranks by the last column (ratio).
  # NOTE(review): presumably the data is still grouped by book here, giving
  # one top chapter per book - confirm.
  top_n(1)
Joining, by = "word"
Selecting by ratio
LS0tCnRpdGxlOiAiUiBUZXh0IEFuYWx5c2lzIEV4YW1wbGUiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCgpbaHR0cHM6Ly93d3cudGlkeXRleHRtaW5pbmcuY29tL10oaHR0cHM6Ly93d3cudGlkeXRleHRtaW5pbmcuY29tLykgLSB0ZXh0IG1pbmluZyB3aXRoIFIgYm9vawoKCgoKYGBge3J9Cmluc3RhbGwucGFja2FnZXMoImphbmVhdXN0ZW5yIikKbGlicmFyeShqYW5lYXVzdGVucikKbGlicmFyeShkcGx5cikKbGlicmFyeShzdHJpbmdyKQpgYGAKCiMjIyBUaWR5dGV4dCBwYWNrYWdlIGV4YW1wbGU6Ckp1bGlhIFNpbGdlIGFuZCBEYXZpZCBSb2JpbnNvbgoKaHR0cHM6Ly9jcmFuLnItcHJvamVjdC5vcmcvd2ViL3BhY2thZ2VzL3RpZHl0ZXh0L3ZpZ25ldHRlcy90aWR5dGV4dC5odG1sCgoKCmBgYHtyfQphdXN0ZW5fYm9va3MoKQpgYGAKCgpgYGB7cn0Kb3JpZ2luYWxfYm9va3MgPC0gYXVzdGVuX2Jvb2tzKCkgJT4lIAogIGdyb3VwX2J5KGJvb2spICU+JSAKICBtdXRhdGUobGluZW51bWJlciA9IHJvd19udW1iZXIoKSwgCiAgICAgICAgICBjaGFwdGVyID0gY3Vtc3VtKHN0cl9kZXRlY3QodGV4dCwgcmVnZXgoIl5jaGFwdGVyIFtcXGRpdnhsY10iLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGlnbm9yZV9jYXNlID0gVFJVRSkpKSkgJT4lIAogIHVuZ3JvdXAoKQoKb3JpZ2luYWxfYm9va3MKYGBgCmBgYHtyfQppbnN0YWxsLnBhY2thZ2VzKCJ0aWR5dGV4dCIpCgpgYGAKYGBge3J9CmxpYnJhcnkodGlkeXRleHQpCgpgYGAKCmBgYHtyfQp0aWR5X2Jvb2tzIDwtIG9yaWdpbmFsX2Jvb2tzICU+JSAKICB1bm5lc3RfdG9rZW5zKHdvcmQsIHRleHQpCgp0aWR5X2Jvb2tzCmBgYAoKYGBge3J9CmRhdGEoInN0b3Bfd29yZHMiKQpjbGVhbmVkX2Jvb2tzIDwtIHRpZHlfYm9va3MgJT4lIAogIGFudGlfam9pbihzdG9wX3dvcmRzKQpgYGAKCmBgYHtyfQpjbGVhbmVkX2Jvb2tzICU+JSAKICBjb3VudCh3b3JkLCBzb3J0ID0gVFJVRSkKYGBgCgpgYGB7cn0KbnJjam95IDwtIGdldF9zZW50aW1lbnRzKCJucmMiKSAlPiUgCiAgZmlsdGVyKHNlbnRpbWVudCA9PSAiam95IikKCgp0aWR5X2Jvb2tzICU+JSAKICBmaWx0ZXIoYm9vayA9PSAiRW1tYSIpICU+JSAKICBzZW1pX2pvaW4obnJjam95KSAlPiUgCiAgY291bnQod29yZCwgc29ydD1UUlVFKQpgYGAKCmBgYHtyfQpsaWJyYXJ5KHRpZHlyKQpgYGAKYGBge3J9CmxpYnJhcnkodGlkeXIpCmJpbmcgPC0gZ2V0X3NlbnRpbWVudHMoImJpbmciKQoKamFuZWF1c3RlbnNlbnRpbWVudCA8LSB0aWR5X2Jvb2tzICU+JQogIGlubmVyX2pvaW4oYmluZykgJT4lCiAgY291bnQoYm9vaywgaW5kZXggPSBsaW5lbnVtYmVyICUvJSA4MCwgc2VudGltZW50KSAlPiUKICBzcHJlYWQoc2VudGltZW50LCBuLCBmaWxsID0gMCkgJT4lCiAgbXV0YXRlKHNlbnRpbWVudCA9IHBvc2l0aXZlIC0gbmVnYXRpdmUpCmBgYApgYGB7cn0KbGlicmFyeShnZ3Bsb3QyKSAjIHBs
b3Qgc2VudGltZW50IHNjb3JlcyBhY2Nyb3NzIHBsb3QgdHJhamVjdG9yeSBvZiBlYWNoIG5vdmVsCgpnZ3Bsb3QoamFuZWF1c3RlbnNlbnRpbWVudCwgYWVzKGluZGV4LCBzZW50aW1lbnQsIGZpbGwgPSBib29rKSkgKwogIGdlb21fYmFyKHN0YXQgPSAiaWRlbnRpdHkiLCBzaG93LmxlZ2VuZCA9IEZBTFNFKSArCiAgZmFjZXRfd3JhcCh+Ym9vaywgbmNvbCA9IDIsIHNjYWxlcyA9ICJmcmVlX3giKQpgYGAKCgoKYGBge3J9CgojIGZpbmRpbmcgcG9zaXRpdmUgYW5kIG5lZ2F0aXZlIHdvcmRzIC0gYW5hbHl6ZSB3b3JkIGNvdW50cyB0aGF0IGNvbnRyaWJ1dGUgdG8gZWFjaCBzZW50aW1lbnQKIGJpbmdfd29yZF9jb3VudHMgPC0gdGlkeV9ib29rcyAlPiUgCiAgaW5uZXJfam9pbihiaW5nKSAlPiUgCiAgY291bnQod29yZCwgc2VudGltZW50LCBzb3J0ID0gVFJVRSkgJT4lIAogIHVuZ3JvdXAoKQoKYmluZ193b3JkX2NvdW50cwoKYGBgCmBgYHtyfQpiaW5nX3dvcmRfY291bnRzICU+JSAKICBmaWx0ZXIobiA+IDE1MCkgJT4lIAogIG11dGF0ZShuID0gaWZlbHNlKHNlbnRpbWVudCA9PSAibmVnYXRpdmUiLCAtbiwgbikpICU+JSAKICBtdXRhdGUod29yZCA9IHJlb3JkZXIod29yZCwgbikpICU+JSAKICBnZ3Bsb3QoYWVzKHdvcmQsIG4sIGZpbGwgPSBzZW50aW1lbnQpKSArCiAgZ2VvbV9iYXIoc3RhdCA9ICJpZGVudGl0eSIpICsKICB0aGVtZShheGlzLnRleHQueCA9IGVsZW1lbnRfdGV4dChhbmdsZSA9IDkwLCBoanVzdCA9IDEpKSArCiAgeWxhYigiY29udHJpYnV0aW9uIHRvIHNlbnRpbWVudCIpCmBgYAoKCmBgYHtyfQppbnN0YWxsLnBhY2thZ2VzKCJ3b3JkY2xvdWQiKQpsaWJyYXJ5KHdvcmRjbG91ZCkKYGBgCgoKYGBge3J9CmNsZWFuZWRfYm9va3MgJT4lIAogIGNvdW50KHdvcmQpICU+JSAKICB3aXRoKHdvcmRjbG91ZCh3b3JkLCBuLCBtYXgud29yZHMgPSA3NSkpCmBgYAoKCgpgYGB7cn0KbGlicmFyeShyZXNoYXBlMikKCmBgYAoKYGBge3J9CnRpZHlfYm9va3MgJT4lIAogIGlubmVyX2pvaW4oYmluZykgJT4lIAogIGNvdW50KHdvcmQsIHNlbnRpbWVudCwgc29ydCA9IFRSVUUpICU+JSAKICBhY2FzdCh3b3JkIH4gc2VudGltZW50LCB2YWx1ZS52YXIgPSAibiIsIGZpbGwgPSAwKSAlPiUgCiAgY29tcGFyaXNvbi5jbG91ZChjb2xvcnMgPSBjKCIjRjg3NjZEIiwgIiMwMEJGQzQiKSwgbWF4LndvcmRzID0gNzUpCmBgYAoKYGBge3J9ClBhbmRQX3NlbnRlbmNlcyA8LSBkYXRhLmZyYW1lKHRleHQgPSBwcmlkZXByZWp1ZGljZSkgJT4lIAogIHVubmVzdF90b2tlbnMoc2VudGVuY2UsIHRleHQsIHRva2VuID0gInNlbnRlbmNlcyIpCgpQYW5kUF9zZW50ZW5jZXMkc2VudGVuY2VbMl0KYGBgCgoKYGBge3J9ClBhbmRQX3NlbnRlbmNlcwpgYGAKCmBgYHtyfQphdXN0ZW5fY2hhcHRlcnMgPC0gYXVzdGVuX2Jvb2tzKCkgJT4lIAogIGdyb3VwX2J5KGJvb2spICU+JSAKICB1bm5lc3RfdG9rZW5zKGNoYXB0ZXIsIHRleHQsIHRva2VuID0g
InJlZ2V4IiwgcGF0dGVybiA9ICJDaGFwdGVyfENIQVBURVIgW1xcZElWWExDXSIpICU+JSAKICB1bmdyb3VwKCkKICAKCmF1c3Rlbl9jaGFwdGVycwpgYGAKCmBgYHtyfQphdXN0ZW5fY2hhcHRlcnMgJT4lIAogIGdyb3VwX2J5KGJvb2spICU+JSAKICBzdW1tYXJpc2UoY2hhcHRlcnMgPSBuKCkpCmBgYAoKCgpgYGB7cn0KYmluZ25lZ2F0aXZlIDwtIGdldF9zZW50aW1lbnRzKCJiaW5nIikgJT4lCiAgZmlsdGVyKHNlbnRpbWVudCA9PSAibmVnYXRpdmUiKQoKd29yZGNvdW50cyA8LSB0aWR5X2Jvb2tzICU+JQogIGdyb3VwX2J5KGJvb2ssIGNoYXB0ZXIpICU+JQogIHN1bW1hcml6ZSh3b3JkcyA9IG4oKSkKCnRpZHlfYm9va3MgJT4lCiAgc2VtaV9qb2luKGJpbmduZWdhdGl2ZSkgJT4lCiAgZ3JvdXBfYnkoYm9vaywgY2hhcHRlcikgJT4lCiAgc3VtbWFyaXplKG5lZ2F0aXZld29yZHMgPSBuKCkpICU+JQogIGxlZnRfam9pbih3b3JkY291bnRzLCBieSA9IGMoImJvb2siLCAiY2hhcHRlciIpKSAlPiUKICBtdXRhdGUocmF0aW8gPSBuZWdhdGl2ZXdvcmRzL3dvcmRzKSAlPiUKICBmaWx0ZXIoY2hhcHRlciAhPSAwKSAlPiUKICB0b3BfbigxKQoKYGBgCgoKCgoK